{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use Closed-Form Policy to Play BipedalWalker-v3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "D:\\ProgramData\\Anaconda3\\lib\\site-packages\\ale_py\\roms\\__init__.py:94: DeprecationWarning: Automatic importing of atari-py roms won't be supported in future releases of ale-py. Please migrate over to using `ale-import-roms` OR an ALE-supported ROM package. To make this warning disappear you can run `ale-import-roms --import-from-pkg atari_py.atari_roms`.For more information see: https://github.com/mgbellemare/Arcade-Learning-Environment#rom-management\n",
      "  _RESOLVED_ROMS = _resolve_roms()\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import logging\n",
    "import itertools\n",
    "\n",
    "import numpy as np\n",
    "np.random.seed(0)\n",
    "import gym\n",
    "\n",
    "logging.basicConfig(level=logging.DEBUG,\n",
    "        format='%(asctime)s [%(levelname)s] %(message)s',\n",
    "        stream=sys.stdout, datefmt='%H:%M:%S')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "23:34:56 [INFO] env: <BipedalWalker<BipedalWalker-v3>>\n",
      "23:34:56 [INFO] _action_space: None\n",
      "23:34:56 [INFO] _observation_space: None\n",
      "23:34:56 [INFO] _reward_range: None\n",
      "23:34:56 [INFO] _metadata: None\n",
      "23:34:56 [INFO] _max_episode_steps: 1600\n",
      "23:34:56 [INFO] _elapsed_steps: None\n",
      "23:34:56 [INFO] id: BipedalWalker-v3\n",
      "23:34:56 [INFO] entry_point: gym.envs.box2d:BipedalWalker\n",
      "23:34:56 [INFO] reward_threshold: 300\n",
      "23:34:56 [INFO] nondeterministic: False\n",
      "23:34:56 [INFO] max_episode_steps: 1600\n",
      "23:34:56 [INFO] order_enforce: True\n",
      "23:34:56 [INFO] _kwargs: {}\n",
      "23:34:56 [INFO] _env_name: BipedalWalker\n"
     ]
    }
   ],
   "source": [
    "env = gym.make('BipedalWalker-v3')\n",
    "env.seed(0)\n",
    "for key in vars(env):\n",
    "    logging.info('%s: %s', key, vars(env)[key])\n",
    "for key in vars(env.spec):\n",
    "    logging.info('%s: %s', key, vars(env.spec)[key])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ClosedFormAgent:\n",
    "    def __init__(self, env):\n",
    "        self.weights = np.array([\n",
    "            [ 0.9, -0.7,  0.0, -1.4],\n",
    "            [ 4.3, -1.6, -4.4, -2.0],\n",
    "            [ 2.4, -4.2, -1.3, -0.1],\n",
    "            [-3.1, -5.0, -2.0, -3.3],\n",
    "            [-0.8,  1.4,  1.7,  0.2],\n",
    "            [-0.7,  0.2, -0.2,  0.1],\n",
    "            [-0.6, -1.5, -0.6,  0.3],\n",
    "            [-0.5, -0.3,  0.2,  0.1],\n",
    "            [ 0.0, -0.1, -0.1,  0.1],\n",
    "            [ 0.4,  0.8, -1.6, -0.5],\n",
    "            [-0.4,  0.5, -0.3, -0.4],\n",
    "            [ 0.3,  2.0,  0.9, -1.6],\n",
    "            [ 0.0, -0.2,  0.1, -0.3],\n",
    "            [ 0.1,  0.2, -0.5, -0.3],\n",
    "            [ 0.7,  0.3,  5.1, -2.4],\n",
    "            [-0.4, -2.3,  0.3, -4.0],\n",
    "            [ 0.1, -0.8,  0.3,  2.5],\n",
    "            [ 0.4, -0.9, -1.8,  0.3],\n",
    "            [-3.9, -3.5,  2.8,  0.8],\n",
    "            [ 0.4, -2.8,  0.4,  1.4],\n",
    "            [-2.2, -2.1, -2.2, -3.2],\n",
    "            [-2.7, -2.6,  0.3,  0.6],\n",
    "            [ 2.0,  2.8,  0.0, -0.9],\n",
    "            [-2.2,  0.6,  4.7, -4.6],\n",
    "            ])\n",
    "        self.bias = np.array([3.2, 6.1, -4.0, 7.6])\n",
    "\n",
    "    def reset(self, mode=None):\n",
    "        pass\n",
    "\n",
    "    def step(self, observation, _reward, _done):\n",
    "        action = np.matmul(observation, self.weights) + self.bias\n",
    "        return action\n",
    "\n",
    "    def close(self):\n",
    "        pass\n",
    "\n",
    "\n",
    "agent = ClosedFormAgent(env)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "23:34:56 [INFO] ==== test ====\n",
      "23:35:32 [DEBUG] test episode 0: reward = 312.65, steps = 1248\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-4-0f65c6ccb8f6>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     21\u001b[0m \u001b[0mepisode_rewards\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     22\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mepisode\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 23\u001b[1;33m     \u001b[0mepisode_reward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0melapsed_steps\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mplay_episode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0magent\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m=\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     24\u001b[0m     \u001b[0mepisode_rewards\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mepisode_reward\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     25\u001b[0m     logging.debug('test episode %d: reward = %.2f, steps = %d',\n",
      "\u001b[1;32m<ipython-input-4-0f65c6ccb8f6>\u001b[0m in \u001b[0;36mplay_episode\u001b[1;34m(env, agent, max_episode_steps, mode, render)\u001b[0m\n\u001b[0;32m      6\u001b[0m         \u001b[0maction\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0magent\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mobservation\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreward\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      7\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mrender\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m             \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      9\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mdone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     10\u001b[0m             \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\gym\\core.py\u001b[0m in \u001b[0;36mrender\u001b[1;34m(self, mode, **kwargs)\u001b[0m\n\u001b[0;32m    293\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    294\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m\"human\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 295\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmode\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    296\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    297\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\gym\\envs\\box2d\\bipedal_walker.py\u001b[0m in \u001b[0;36mrender\u001b[1;34m(self, mode)\u001b[0m\n\u001b[0;32m    562\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mviewer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdraw_polyline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mf\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolor\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mlinewidth\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    563\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 564\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mviewer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrender\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mreturn_rgb_array\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmode\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"rgb_array\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    565\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    566\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\gym\\envs\\classic_control\\rendering.py\u001b[0m in \u001b[0;36mrender\u001b[1;34m(self, return_rgb_array)\u001b[0m\n\u001b[0;32m    143\u001b[0m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwidth\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    144\u001b[0m             \u001b[0marr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 145\u001b[1;33m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwindow\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    146\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0monetime_geoms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    147\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0marr\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mreturn_rgb_array\u001b[0m \u001b[1;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0misopen\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mD:\\ProgramData\\Anaconda3\\lib\\site-packages\\pyglet\\window\\win32\\__init__.py\u001b[0m in \u001b[0;36mflip\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    354\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_always_dwm\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_dwm_composition_enabled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    355\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_interval\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 356\u001b[1;33m                     \u001b[0m_dwmapi\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mDwmFlush\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    357\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    358\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):\n",
    "    observation, reward, done = env.reset(), 0., False\n",
    "    agent.reset(mode=mode)\n",
    "    episode_reward, elapsed_steps = 0., 0\n",
    "    while True:\n",
    "        action = agent.step(observation, reward, done)\n",
    "        if render:\n",
    "            env.render()\n",
    "        if done:\n",
    "            break\n",
    "        observation, reward, done, _ = env.step(action)\n",
    "        episode_reward += reward\n",
    "        elapsed_steps += 1\n",
    "        if max_episode_steps and elapsed_steps >= max_episode_steps:\n",
    "            break\n",
    "    agent.close()\n",
    "    return episode_reward, elapsed_steps\n",
    "\n",
    "\n",
    "logging.info('==== test ====')\n",
    "episode_rewards = []\n",
    "for episode in range(100):\n",
    "    episode_reward, elapsed_steps = play_episode(env, agent,render= 1)\n",
    "    episode_rewards.append(episode_reward)\n",
    "    logging.debug('test episode %d: reward = %.2f, steps = %d',\n",
    "            episode, episode_reward, elapsed_steps)\n",
    "logging.info('average episode reward = %.2f ± %.2f',\n",
    "        np.mean(episode_rewards), np.std(episode_rewards))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
